{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import random\n",
    "\n",
    "import jsonlines\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "def read_jsonl(file_paths):\n",
    "    \"\"\"Collect the n-ary relation records from JSON Lines files.\n",
    "\n",
    "    Args:\n",
    "        file_paths: One path, or an iterable of paths, to JSON Lines files\n",
    "            whose records each carry an `n_ary_relations` list.\n",
    "\n",
    "    Returns:\n",
    "        A list with one dict per relation, in reading order, holding the\n",
    "        `Material`, `Method`, `Metric`, `Task` and `score` fields.\n",
    "\n",
    "    Raises:\n",
    "        KeyError: If a record has no `n_ary_relations` entry or a relation\n",
    "            lacks one of the five fields.\n",
    "    \"\"\"\n",
    "    relation_fields = (\"Material\", \"Method\", \"Metric\", \"Task\", \"score\")\n",
    "    # A bare string would otherwise be iterated character by character.\n",
    "    if isinstance(file_paths, str):\n",
    "        file_paths = [file_paths]\n",
    "    data = []\n",
    "    for file_path in file_paths:\n",
    "        # Decode explicitly as UTF-8 instead of relying on the platform default,\n",
    "        # which is not UTF-8 everywhere.\n",
    "        with open(file_path, \"r\", encoding=\"utf-8\") as f:\n",
    "            for item in jsonlines.Reader(f):\n",
    "                for relation in item[\"n_ary_relations\"]:\n",
    "                    data.append({field: relation[field] for field in relation_fields})\n",
    "    return data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Replace the <YOUR_OWN_PATH> placeholder before running this cell.\n",
    "file_paths = [\"/<YOUR_OWN_PATH>/ToolQA/data/raw_data/scirex/{}.jsonl\".format(split)\n",
    "              for split in (\"train\", \"dev\", \"test\")]\n",
    "data = read_jsonl(file_paths)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "2148\n"
     ]
    }
   ],
   "source": [
    "def _clean_score(score):\n",
    "    \"\"\"Normalise one raw score, converting it to float when possible.\n",
    "\n",
    "    Non-string values are returned unchanged. For strings, byte-order marks\n",
    "    are removed, commas are replaced by dots and `---` becomes `-`. The\n",
    "    cleaned text is returned as a float if `float()` accepts it; otherwise\n",
    "    (e.g. `71.8%`, `190k`, `-` or an empty string) it is returned as text.\n",
    "    \"\"\"\n",
    "    if not isinstance(score, str):\n",
    "        return score\n",
    "    score = score.replace(\"\\ufeff\", \"\").replace(\",\", \".\")\n",
    "    if score == \"---\":\n",
    "        score = \"-\"\n",
    "    try:\n",
    "        return float(score)\n",
    "    except ValueError:\n",
    "        # Not a plain number: keep the cleaned text rather than aborting the cell.\n",
    "        return score\n",
    "\n",
    "\n",
    "for record in data:\n",
    "    record[\"score\"] = _clean_score(record[\"score\"])\n",
    "df = pd.DataFrame(data)\n",
    "print(len(df))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0         2.1\n",
      "1       71.8%\n",
      "2       469.0\n",
      "3       71.8%\n",
      "4       0.869\n",
      "5        76.9\n",
      "6        79.6\n",
      "7       10.64\n",
      "8       23.96\n",
      "9       33.45\n",
      "10      15.69\n",
      "11      15.68\n",
      "12      36.74\n",
      "13       25.9\n",
      "14       28.5\n",
      "15     73.744\n",
      "16     67.974\n",
      "17     81.525\n",
      "18     77.323\n",
      "19       85.1\n",
      "20       86.7\n",
      "21       190k\n",
      "22     5185.0\n",
      "23     4650.0\n",
      "24    5.2667%\n",
      "25       0.95\n",
      "26       0.88\n",
      "27      0.778\n",
      "28      0.686\n",
      "29      0.881\n",
      "30      0.956\n",
      "31      0.819\n",
      "32      0.627\n",
      "33      0.498\n",
      "34      80.9%\n",
      "35      95.6%\n",
      "36      71.18\n",
      "37      69.21\n",
      "38      61.41\n",
      "39      69.73\n",
      "40      65.02\n",
      "41      57.19\n",
      "42       63.1\n",
      "43       58.2\n",
      "44      68.07\n",
      "45      68.16\n",
      "46       8.59\n",
      "47       55.8\n",
      "48       27.0\n",
      "49      507.0\n",
      "Name: score, dtype: object <class 'pandas.core.series.Series'>\n"
     ]
    }
   ],
   "source": [
    "# Peek at the first 50 cleaned scores and the type of the selection.\n",
    "score_preview = df.iloc[:50][\"score\"]\n",
    "print(score_preview, type(score_preview))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sample relation rows from `df` and turn them into templated questions.\n",
    "# Templates (one is drawn at random per attempt):\n",
    "#   0: dataset on which a method reaches the highest score for a metric/task\n",
    "#   1: method that reaches the highest score on a dataset for a metric/task\n",
    "#   2: metrics reported for a method on a dataset for a task\n",
    "#   3: datasets a method is evaluated on for a task\n",
    "# An answer is used at most once per template.\n",
    "NUM_QUESTIONS = 100\n",
    "# Cap on sampling attempts so the loop also ends when fewer than NUM_QUESTIONS\n",
    "# distinct answers exist.\n",
    "MAX_ATTEMPTS = 100000\n",
    "# Fixed seed so that re-running the notebook yields the same questions.\n",
    "QUESTION_SEED = 0\n",
    "rng = random.Random(QUESTION_SEED)\n",
    "\n",
    "\n",
    "def _top_scoring(candidates, answer_column):\n",
    "    \"\"\"Return `answer_column` of the candidate row with the highest numeric score.\n",
    "\n",
    "    Scores that are not numbers (strings such as `71.8%` or `190k`) are ignored:\n",
    "    sorting them together with floats raises TypeError, and sorting strings\n",
    "    alone orders them lexicographically. Returns None when fewer than two\n",
    "    candidates have a numeric score, i.e. when there is nothing to compare.\n",
    "    \"\"\"\n",
    "    numeric_scores = pd.to_numeric(candidates[\"score\"], errors=\"coerce\").dropna()\n",
    "    if len(numeric_scores) < 2:\n",
    "        return None\n",
    "    return candidates.loc[numeric_scores.idxmax(), answer_column]\n",
    "\n",
    "\n",
    "questions = []\n",
    "question_id = 0\n",
    "answers1, answers2, answers3, answers4 = [], [], [], []\n",
    "# Answers already emitted, keyed by template index.\n",
    "used_answers = {0: answers1, 1: answers2, 2: answers3, 3: answers4}\n",
    "attempts = 0\n",
    "while question_id < NUM_QUESTIONS and attempts < MAX_ATTEMPTS:\n",
    "    attempts += 1\n",
    "    row = df.iloc[rng.randint(0, len(df) - 1)]\n",
    "    Material, Method, Metric, Task = row[\"Material\"], row[\"Method\"], row[\"Metric\"], row[\"Task\"]\n",
    "    same_material = df[\"Material\"] == Material\n",
    "    same_method = df[\"Method\"] == Method\n",
    "    same_metric = df[\"Metric\"] == Metric\n",
    "    same_task = df[\"Task\"] == Task\n",
    "    question_indices = rng.randint(0, 3)\n",
    "    if question_indices == 0:\n",
    "        question = \"On which dataset does the {} method achieve the highest {} score for {} task?\".format(Method, Metric, Task)\n",
    "        answer = _top_scoring(df.loc[same_method & same_metric & same_task], \"Material\")\n",
    "    elif question_indices == 1:\n",
    "        question = \"Which method achieves the highest {} score on {} dataset for {} task?\".format(Metric, Material, Task)\n",
    "        answer = _top_scoring(df.loc[same_material & same_metric & same_task], \"Method\")\n",
    "    elif question_indices == 2:\n",
    "        question = \"On what metrics is the {} method evaluated on {} dataset for {} task?\".format(Method, Material, Task)\n",
    "        answer = \", \".join(df.loc[same_material & same_method & same_task, \"Metric\"].unique())\n",
    "    else:\n",
    "        question = \"Which datasets is {} method evaluated on for {} task?\".format(Method, Task)\n",
    "        # The question fixes only the method and the task, so Material must not\n",
    "        # be part of the filter; otherwise the answer is always the sampled\n",
    "        # row's own dataset.\n",
    "        answer = \", \".join(df.loc[same_method & same_task, \"Material\"].unique())\n",
    "    if answer is None or answer in used_answers[question_indices]:\n",
    "        continue\n",
    "    used_answers[question_indices].append(answer)\n",
    "    questions.append({\"qid\": \"hard-scirex-{:0>4d}\".format(question_id), \"question\": question, \"answer\": answer})\n",
    "    question_id += 1\n",
    "if question_id < NUM_QUESTIONS:\n",
    "    print(\"Only generated {} of {} questions in {} attempts\".format(question_id, NUM_QUESTIONS, attempts))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Persist the generated questions, one JSON object per line.\n",
    "with jsonlines.open('/<YOUR_OWN_PATH>/ToolQA/data/questions/hard/scirex-hard.jsonl', mode='w') as writer:\n",
    "    writer.write_all(questions)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "llm",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.16"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
